from langchain_text_splitters import  HTMLHeaderTextSplitter


# Read the whole HTML document up front. The file handle is only needed for
# this single read, so the `with` block is kept minimal and the file is closed
# before any splitting or printing happens.
with open("./test.html", encoding="utf-8") as f:
    read_string = f.read()

# (tag, label) pairs handed to the splitter. The labels are Chinese for
# "level-1 heading", "level-2 heading" and "level-3 heading"; they are runtime
# strings and must stay exactly as written.
headers_to_spliter_on = [
    ("h1", "一级标题"),
    ("h2", "二级标题"),
    ("h3", "三级标题"),
]

# Build the splitter from the heading configuration and split the document
# text that was read above.
headers_to_spliter = HTMLHeaderTextSplitter(
    headers_to_split_on=headers_to_spliter_on,
)
chucks = headers_to_spliter.split_text(read_string)

# Print every resulting piece on its own line.
for one in chucks:
    print(one)